{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## This notebook is used to generate the finalized version of the classifier, to simply feature transformation into the final form, and to test that the results are the same\n",
    "\n",
    "Most of the code comes from operational_classifier."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pickle\n",
    "import sys\n",
    "#reload(sys)\n",
    "#sys.setdefaultencoding(\"utf-8\")\n",
    "\n",
    "#Loading raw data\n",
    "df = pickle.load(open(\"../Data/multiclass_tweets_indexed.p\",'rb'))\n",
    "tweets = df.text"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Feature generation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "import nltk\n",
    "from nltk.stem.porter import *\n",
    "import string\n",
    "import re\n",
    "\n",
    "stopwords=stopwords = nltk.corpus.stopwords.words(\"english\")\n",
    "\n",
    "other_exclusions = [\"#ff\", \"ff\", \"rt\"]\n",
    "stopwords.extend(other_exclusions)\n",
    "\n",
    "stemmer = PorterStemmer()\n",
    "\n",
    "\n",
    "def preprocess(text_string):\n",
    "    \"\"\"\n",
    "    Accepts a text string and replaces:\n",
    "    1) urls with URLHERE\n",
    "    2) lots of whitespace with one instance\n",
    "    3) mentions with MENTIONHERE\n",
    "\n",
    "    This allows us to get standardized counts of urls and mentions\n",
    "    Without caring about specific people mentioned\n",
    "    \"\"\"\n",
    "    space_pattern = '\\s+'\n",
    "    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'\n",
    "        '[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')\n",
    "    mention_regex = '@[\\w\\-]+'\n",
    "    parsed_text = re.sub(space_pattern, ' ', text_string)\n",
    "    parsed_text = re.sub(giant_url_regex, '', parsed_text)\n",
    "    parsed_text = re.sub(mention_regex, '', parsed_text)\n",
    "    #parsed_text = parsed_text.code(\"utf-8\", errors='ignore')\n",
    "    return parsed_text\n",
    "\n",
    "def tokenize(tweet):\n",
    "    \"\"\"Removes punctuation & excess whitespace, sets to lowercase,\n",
    "    and stems tweets. Returns a list of stemmed tokens.\"\"\"\n",
    "    tweet = \" \".join(re.split(\"[^a-zA-Z]*\", tweet.lower())).strip()\n",
    "    #tokens = re.split(\"[^a-zA-Z]*\", tweet.lower())\n",
    "    tokens = [stemmer.stem(t) for t in tweet.split()]\n",
    "    return tokens\n",
    "\n",
    "def basic_tokenize(tweet):\n",
    "    \"\"\"Same as tokenize but without the stemming\"\"\"\n",
    "    tweet = \" \".join(re.split(\"[^a-zA-Z.,!?]*\", tweet.lower())).strip()\n",
    "    return tweet.split()\n",
    "\n",
    "vectorizer = TfidfVectorizer(\n",
    "    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(\n",
    "    tokenizer=tokenize,\n",
    "    preprocessor=preprocess,\n",
    "    ngram_range=(1, 3),\n",
    "    stop_words=stopwords, #We do better when we keep stopwords\n",
    "    use_idf=True,\n",
    "    smooth_idf=False,\n",
    "    norm=None, #Applies l2 norm smoothing\n",
    "    decode_error='replace',\n",
    "    max_features=10000,\n",
    "    min_df=5,\n",
    "    max_df=0.501\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "#Construct tfidf matrix and get relevant scores\n",
    "tfidf = vectorizer.fit_transform(tweets).toarray()\n",
    "vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}\n",
    "idf_vals = vectorizer.idf_\n",
    "idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "#Get POS tags for tweets and save as a string\n",
    "tweet_tags = []\n",
    "for t in tweets:\n",
    "    tokens = basic_tokenize(preprocess(t))\n",
    "    tags = nltk.pos_tag(tokens)\n",
    "    tag_list = [x[1] for x in tags]\n",
    "    #for i in range(0, len(tokens)):\n",
    "    tag_str = \" \".join(tag_list)\n",
    "    tweet_tags.append(tag_str)\n",
    "        #print(tokens[i],tag_list[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "#We can use the TFIDF vectorizer to get a token matrix for the POS tags\n",
    "pos_vectorizer = TfidfVectorizer(\n",
    "    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(\n",
    "    tokenizer=None,\n",
    "    lowercase=False,\n",
    "    preprocessor=None,\n",
    "    ngram_range=(1, 3),\n",
    "    stop_words=None, #We do better when we keep stopwords\n",
    "    use_idf=False,\n",
    "    smooth_idf=False,\n",
    "    norm=None, #Applies l2 norm smoothing\n",
    "    decode_error='replace',\n",
    "    max_features=5000,\n",
    "    min_df=5,\n",
    "    max_df=0.501,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "#Construct POS TF matrix and get vocab dict\n",
    "pos = pos_vectorizer.fit_transform(pd.Series(tweet_tags)).toarray()\n",
    "pos_vocab = {v:i for i, v in enumerate(pos_vectorizer.get_feature_names())}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "#Now get other features\n",
    "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS\n",
    "from textstat.textstat import *\n",
    "\n",
    "sentiment_analyzer = VS()\n",
    "\n",
    "def count_twitter_objs(text_string):\n",
    "    \"\"\"\n",
    "    Accepts a text string and replaces:\n",
    "    1) urls with URLHERE\n",
    "    2) lots of whitespace with one instance\n",
    "    3) mentions with MENTIONHERE\n",
    "    4) hashtags with HASHTAGHERE\n",
    "\n",
    "    This allows us to get standardized counts of urls and mentions\n",
    "    Without caring about specific people mentioned.\n",
    "    \n",
    "    Returns counts of urls, mentions, and hashtags.\n",
    "    \"\"\"\n",
    "    space_pattern = '\\s+'\n",
    "    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'\n",
    "        '[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')\n",
    "    mention_regex = '@[\\w\\-]+'\n",
    "    hashtag_regex = '#[\\w\\-]+'\n",
    "    parsed_text = re.sub(space_pattern, ' ', text_string)\n",
    "    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)\n",
    "    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)\n",
    "    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)\n",
    "    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))\n",
    "\n",
    "def other_features(tweet):\n",
    "    \"\"\"This function takes a string and returns a list of features.\n",
    "    These include Sentiment scores, Text and Readability scores,\n",
    "    as well as Twitter specific features\"\"\"\n",
    "    ##SENTIMENT\n",
    "    sentiment = sentiment_analyzer.polarity_scores(tweet)\n",
    "    \n",
    "    words = preprocess(tweet) #Get text only\n",
    "    \n",
    "    syllables = textstat.syllable_count(words) #count syllables in words\n",
    "    num_chars = sum(len(w) for w in words) #num chars in words\n",
    "    num_chars_total = len(tweet)\n",
    "    num_terms = len(tweet.split())\n",
    "    num_words = len(words.split())\n",
    "    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)\n",
    "    num_unique_terms = len(set(words.split()))\n",
    "    \n",
    "    ###Modified FK grade, where avg words per sentence is just num words/1\n",
    "    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)\n",
    "    ##Modified FRE score, where sentence fixed to 1\n",
    "    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)\n",
    "    \n",
    "    twitter_objs = count_twitter_objs(tweet) #Count #, @, and http://\n",
    "    retweet = 0\n",
    "    if \"rt\" in words:\n",
    "        retweet = 1\n",
    "    features = [FKRA, FRE,syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words,\n",
    "                num_unique_terms, sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'],\n",
    "                twitter_objs[2], twitter_objs[1],\n",
    "                twitter_objs[0], retweet]\n",
    "    #features = pandas.DataFrame(features)\n",
    "    return features\n",
    "\n",
    "def get_feature_array(tweets):\n",
    "    feats=[]\n",
    "    for t in tweets:\n",
    "        feats.append(other_features(t))\n",
    "    return np.array(feats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "other_features_names = [\"FKRA\", \"FRE\",\"num_syllables\", \"avg_syl_per_word\", \"num_chars\", \"num_chars_total\", \\\n",
    "                        \"num_terms\", \"num_words\", \"num_unique_words\", \"vader neg\",\"vader pos\",\"vader neu\", \"vader compound\", \\\n",
    "                        \"num_hashtags\", \"num_mentions\", \"num_urls\", \"is_retweet\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "feats = get_feature_array(tweets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "#Now join them all up\n",
    "M = np.concatenate([tfidf,pos,feats],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(24802, 11171)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "M.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "#Finally get a list of variable names\n",
    "variables = ['']*len(vocab)\n",
    "for k,v in vocab.iteritems():\n",
    "    variables[v] = k\n",
    "\n",
    "pos_variables = ['']*len(pos_vocab)\n",
    "for k,v in pos_vocab.iteritems():\n",
    "    pos_variables[v] = k\n",
    "\n",
    "feature_names = variables+pos_variables+other_features_names"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "# Running the model\n",
    "\n",
    "This model was found using a GridSearch with 5-fold cross validation. Details are in the notebook operational_classifier."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "X = pd.DataFrame(M)\n",
    "y = df['class'].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.feature_selection import SelectFromModel\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.svm import LinearSVC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "select = SelectFromModel(LogisticRegression(class_weight='balanced',penalty=\"l1\",C=0.01))\n",
    "X_ = select.fit_transform(X,y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "model = LinearSVC(class_weight='balanced',C=0.01, penalty='l2', loss='squared_hinge',multi_class='ovr').fit(X_, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "y_preds = model.predict(X_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "report = classification_report( y, y_preds )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.46      0.57      0.51      1431\n",
      "          1       0.97      0.92      0.94     19206\n",
      "          2       0.82      0.96      0.89      4165\n",
      "\n",
      "avg / total       0.91      0.90      0.91     24802\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(report)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "# Using information from the model to obtain the matrix X_ generically\n",
    "\n",
    "This is the most difficult task: We have to take the inputs tweets and transform them into a format that can be used in the model without going through all the same pre-processing steps as above. This can be done as follows."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Obtaining information about the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "final_features = select.get_support(indices=True) #get indices of features\n",
    "final_feature_list = [unicode(feature_names[i]) for i in final_features] #Get list of names corresponding to indices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[u'america', u'american', u'anoth', u'ass', u'ass cracker', u'ass hoe', u'ass nigga', u'bad', u'beaner', u'big', u'bird', u'bitch', u'bitch nigga', u'black', u'border', u'born', u'bout', u'browni', u'campu', u'charli', u'chill', u'chink', u'color', u'color folk', u'coon', u'countri', u'cracker', u'crazi', u'crippl', u'cunt', u'da', u'damn', u'darki', u'dick', u'die', u'doe', u'dyke', u'fag', u'faggot', u'fat', u'femal', u'feminist', u'filth', u'first', u'folk', u'fucc nicca', u'fuck', u'fuckin', u'game', u'gay', u'get', u'girl', u'gon', u'gook', u'got nigga', u'hate', u'hate hoe', u'hi', u'hire', u'ho', u'hoe', u'hood', u'human', u'israel', u'jap', u'jew', u'jihadi', u'kill', u'lame', u'latina', u'let', u'like', u'lol', u'look like', u'love', u'may', u'mexican', u'mock', u'money', u'monkey', u'muslim', u'muzzi', u'negro', u'nicca', u'nig', u'nigga', u'nigga bitch', u'niggah', u'niggaz', u'nigger', u'nigguh', u'niglet', u'oreo', u'peopl', u'pick', u'play', u'pussi', u'queer', u'race', u'racist', u'real', u'real nigga', u'redneck', u'retard', u'say', u'sex', u'shit', u'shoot', u'show', u'shut', u'side', u'slope', u'smh', u'sole', u'special', u'spic', u'stfu', u'stupid', u'suck', u'tcot', u'teabagg', u'teach', u'thank', u'thi pussi', u'tho', u'throat', u'towel', u'trailer', u'tranni', u'trash', u'tri', u'twat', u'u', u'ugli', u'uncl', u'ur', u'us', u'use', u'wassup', u'watch', u'wear', u'wetback', u'whi', u'white', u'white trash', u'whitey', u'whore', u'word', u'ya', u'yanke', u'yeah', u'yellow', u'yr', u'IN NN', u'NN NN NN', u'PRP VBP', u'RB', u'VB', u'VBD', u'FKRA', u'FRE', u'num_syllables', u'num_chars', u'num_chars_total', u'num_terms', u'num_words', u'num_unique_words', u'vader compound', u'num_hashtags', u'num_mentions']\n"
     ]
    }
   ],
   "source": [
    "print final_feature_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "#Getting names for each class of features\n",
    "ngram_features = final_feature_list[:final_feature_list.index('yr')+1]\n",
    "pos_features = final_feature_list[final_feature_list.index('yr')+1:final_feature_list.index('VBD')+1]\n",
    "oth_features = final_feature_list[final_feature_list.index('VBD')+1:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Generating ngram features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "new_vocab = {v:i for i, v in enumerate(ngram_features)}\n",
    "new_vocab_to_index = {}\n",
    "for k in ngram_features:\n",
    "    new_vocab_to_index[k] = vocab[k]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "#Get indices of text features\n",
    "ngram_indices = final_features[:len(ngram_features)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "#TODO: Pickle new vectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "new_vectorizer = TfidfVectorizer(\n",
    "    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(\n",
    "    tokenizer=tokenize,\n",
    "    preprocessor=preprocess,\n",
    "    ngram_range=(1, 3),\n",
    "    stop_words=stopwords, #We do better when we keep stopwords\n",
    "    use_idf=False,\n",
    "    smooth_idf=False,\n",
    "    norm=None, #Applies l2 norm smoothing\n",
    "    decode_error='replace',\n",
    "    min_df=1,\n",
    "    max_df=1.0,\n",
    "    vocabulary=new_vocab\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 180,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['final_tfidf.pkl']"
      ]
     },
     "execution_count": 180,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.externals import joblib\n",
    "joblib.dump(new_vectorizer, 'final_tfidf.pkl') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "tfidf_ = new_vectorizer.fit_transform(tweets).toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "#Verifying that results are the same"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,\n",
       "        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,\n",
       "        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,\n",
       "        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,\n",
       "        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,\n",
       "        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,\n",
       "        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,\n",
       "        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,\n",
       "        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,\n",
       "        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,\n",
       "        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,\n",
       "        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tfidf_[1,:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2.0"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tfidf_[1,:].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  4.81058113,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        2.81715792,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "        0.        ,  0.        ,  0.        ])"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_[1,:tfidf_.shape[1]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7.6277390516751762"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_[1,:tfidf_.shape[1]].sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Results are the same if use IDF but the problem is that IDF will be different if we use different data. Instead we have to use the original IDF scores and multiply them by the new matrix."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "idf_vals_ = idf_vals[ngram_indices]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(153,)"
      ]
     },
     "execution_count": 117,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "idf_vals_.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 181,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['final_idf.pkl']"
      ]
     },
     "execution_count": 181,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#TODO: Pickle idf_vals\n",
    "\n",
    "joblib.dump(idf_vals_, 'final_idf.pkl') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "        True,  True,  True,  True,  True,  True,  True,  True,  True], dtype=bool)"
      ]
     },
     "execution_count": 122,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(tfidf_[1,:]*idf_vals_) == X_[1,:153] #Got same value as final process array!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ True,  True,  True, ...,  True,  True,  True],\n",
       "       [ True,  True,  True, ...,  True,  True,  True],\n",
       "       [ True,  True,  True, ...,  True,  True,  True],\n",
       "       ..., \n",
       "       [ True,  True,  True, ...,  True,  True,  True],\n",
       "       [ True,  True,  True, ...,  True,  True,  True],\n",
       "       [ True,  True,  True, ...,  True,  True,  True]], dtype=bool)"
      ]
     },
     "execution_count": 129,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tfidf_*idf_vals_ == X_[:,:153]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "tfidffinal = tfidf_*idf_vals_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Generating POS features\n",
    "This is simpler as we do not need to worry about IDF but it will be slower as we have to compute the POS tags for the new data. Here we can simply use the old POS tags."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "new_pos = {v:i for i, v in enumerate(pos_features)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "#TODO: Pickle pos vectorizer\n",
    "#We can use the TFIDF vectorizer to get a token matrix for the POS tags\n",
    "new_pos_vectorizer = TfidfVectorizer(\n",
    "    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(\n",
    "    tokenizer=None,\n",
    "    lowercase=False,\n",
    "    preprocessor=None,\n",
    "    ngram_range=(1, 3),\n",
    "    stop_words=None, #We do better when we keep stopwords\n",
    "    use_idf=False,\n",
    "    smooth_idf=False,\n",
    "    norm=None, #Applies l2 norm smoothing\n",
    "    decode_error='replace',\n",
    "    min_df=1,\n",
    "    max_df=1.0,\n",
    "    vocabulary=new_pos\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 182,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['final_pos.pkl']"
      ]
     },
     "execution_count": 182,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "joblib.dump(new_pos_vectorizer, 'final_pos.pkl') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "pos_ = new_pos_vectorizer.fit_transform(tweet_tags).toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 1.,  1.,  0.,  0.,  0.,  0.])"
      ]
     },
     "execution_count": 135,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pos_[1,:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 1.,  1.,  0.,  0.,  0.,  0.])"
      ]
     },
     "execution_count": 138,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_[1,153:159]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ True,  True,  True,  True,  True,  True],\n",
       "       [ True,  True,  True,  True,  True,  True],\n",
       "       [ True,  True,  True,  True,  True,  True],\n",
       "       ..., \n",
       "       [ True,  True,  True,  True,  True,  True],\n",
       "       [ True,  True,  True,  True,  True,  True],\n",
       "       [ True,  True,  True,  True,  True,  True]], dtype=bool)"
      ]
     },
     "execution_count": 139,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pos_[:,:] == X_[:,153:159]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "61686.0"
      ]
     },
     "execution_count": 140,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pos_[:,:].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "61686.0"
      ]
     },
     "execution_count": 141,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_[:,153:159].sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Finally, we can look at the other features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['FKRA', 'FRE', 'num_syllables', 'avg_syl_per_word', 'num_chars', 'num_chars_total', 'num_terms', 'num_words', 'num_unique_words', 'vader neg', 'vader pos', 'vader neu', 'vader compound', 'num_hashtags', 'num_mentions', 'num_urls', 'is_retweet']\n"
     ]
    }
   ],
   "source": [
    "print other_features_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[u'FKRA', u'FRE', u'num_syllables', u'num_chars', u'num_chars_total', u'num_terms', u'num_words', u'num_unique_words', u'vader compound', u'num_hashtags', u'num_mentions']\n"
     ]
    }
   ],
   "source": [
    "print oth_features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "The functions can be modified to only calculate and return necessary fields."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def other_features_(tweet):\n",
    "    \"\"\"This function takes a string and returns a list of features.\n",
    "    These include Sentiment scores, Text and Readability scores,\n",
    "    as well as Twitter specific features\"\"\"\n",
    "    ##SENTIMENT\n",
    "    sentiment = sentiment_analyzer.polarity_scores(tweet)\n",
    "    \n",
    "    words = preprocess(tweet) #Get text only\n",
    "    \n",
    "    syllables = textstat.syllable_count(words) #count syllables in words\n",
    "    num_chars = sum(len(w) for w in words) #num chars in words\n",
    "    num_chars_total = len(tweet)\n",
    "    num_terms = len(tweet.split())\n",
    "    num_words = len(words.split())\n",
    "    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)\n",
    "    num_unique_terms = len(set(words.split()))\n",
    "    \n",
    "    ###Modified FK grade, where avg words per sentence is just num words/1\n",
    "    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)\n",
    "    ##Modified FRE score, where sentence fixed to 1\n",
    "    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)\n",
    "    \n",
    "    twitter_objs = count_twitter_objs(tweet) #Count #, @, and http://\n",
    "    features = [FKRA, FRE, syllables, num_chars, num_chars_total, num_terms, num_words,\n",
    "                num_unique_terms, sentiment['compound'],\n",
    "                twitter_objs[2], twitter_objs[1],]\n",
    "    #features = pandas.DataFrame(features)\n",
    "    return features\n",
    "\n",
    "def get_feature_array_(tweets):\n",
    "    feats=[]\n",
    "    for t in tweets:\n",
    "        feats.append(other_features_(t))\n",
    "    return np.array(feats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "feats_ = get_feature_array_(tweets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([   6.5   ,   93.14  ,   26.1   ,  127.    ,  140.    ,   25.    ,\n",
       "         25.    ,   23.    ,    0.4563,    0.    ,    1.    ])"
      ]
     },
     "execution_count": 163,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feats_[0,:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([   6.5   ,   93.14  ,   26.1   ,  127.    ,  140.    ,   25.    ,\n",
       "         25.    ,   23.    ,    0.4563,    0.    ,    1.    ])"
      ]
     },
     "execution_count": 164,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_[0,159:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ True,  True,  True, ...,  True,  True,  True],\n",
       "       [ True,  True,  True, ...,  True,  True,  True],\n",
       "       [ True,  True,  True, ...,  True,  True,  True],\n",
       "       ..., \n",
       "       [ True,  True,  True, ...,  True,  True,  True],\n",
       "       [ True,  True,  True, ...,  True,  True,  True],\n",
       "       [ True,  True,  True, ...,  True,  True,  True]], dtype=bool)"
      ]
     },
     "execution_count": 167,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feats_[:,:] == X_[:,159:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Now that we have put it all together using a simplified process we can assess if these new data return the same answers."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "M_ = np.concatenate([tfidffinal, pos_, feats_],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(24802, 170)"
      ]
     },
     "execution_count": 170,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "M_.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "X__ = pd.DataFrame(M_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 177,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "y_preds_ = model.predict(X__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 178,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "report = classification_report( y, y_preds_ )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 179,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.46      0.57      0.51      1431\n",
      "          1       0.97      0.92      0.94     19206\n",
      "          2       0.82      0.96      0.89      4165\n",
      "\n",
      "avg / total       0.91      0.90      0.91     24802\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(report)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "OK. So now that we have verified that the results are the same with X_ and X__ we can implement a script that can transform new data in this manner."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
